Aim

In more than 100 samples, 35,500 reporters for 86 TFs were transfected into various cells (mainly mES, K562 & HepG2), and stimulated with various compounds. To be more precise, the data consists of two different libraries, library 1, which is more focused on mES biology, and library 2, which is more focused on classical signaling pathways. All the barcode counts from the 7 different sequencing runs are combined in this analysis. In a previous script (mt20211021_barcode_preprocessing… .Rmd), sanity checks were be carried out to filter out samples with bad quality, and barcode counts were be normalized to compute reporter activitites. In this script, plots will be generated to characterize the TF reporter activities and models will be built to extract the importance of reporter features.

Description of input data

  • library = with which library was this sample transfected (1, 2, or 1+2)
  • nchar = barcode length (12 for library 1, 13 for library 2)
  • commercial_reporter = positive control reporter sequence from commercial source
  • neg_ctrls = reporter with mutated TF binding sites?
  • hPGK = positive control, chunk from hPGK promoter (only in lib 1)
  • native_enhancer = genomic mES enhancer sequence chunks from Miguels libraries (only lib 1)
  • activity = rpm + pseudocount cDNA / rpm + pseudocount pDNA
  • mean_activity_sample = mean of activity per unique reporter per sample (individually per replicate)
  • reporter_activity = mean activity per unique reporter per condition (mean of replicates)
  • minP_activity = activity relative to mean of minimal promoter only reporters of that sample
  • mean_activity_sample_minP = mean of minP_activity per unique reporter
  • reporter_activity_minP = mean of mean_activity_sample_minP per condition
  • nfya_activity = activity relative to mean of nfya reporters (constitutive positive control) of that sample
  • mean_activity_sample_nfya = mean of nfya_activity per unique reporter
  • reporter_activity_nfya = mean of mean_activity_sample_nfya per condition
  • mean_activity_sample_neg = mean_activity_sample relative to its paired negative control (identical reporter with mutated binding sites)
  • reporter_activity_neg = mean of mean_activity_sample_neg per condition
  • pval_minP = p-value of t-test comparing activity distributions of individual reporters with the distribution of minimal promoter only reporters within the same condition
  • pval_neg = p-value of t-test comparing activity distributions of individual reporters with the distribution of the activities of its paired negative controls within the same condition

Setup

Libraries

knitr::opts_chunk$set(echo = TRUE)
StartTime <-Sys.time()

# 8-digit Date tag:
Date <- substr(gsub("-","",Sys.time()),1,8) 
# libraries:
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(maditr)
library(tibble)
library(pheatmap)
library(ggpubr)
library(ggbeeswarm)
library(ggforce)
library(viridis)
library(plyr)
library(cowplot)
library(gridExtra)
library(pROC)
library(tidyr)
library(stringr)
library(randomForest)

Functions

### Custom functions
SetFileName <- function(filename, initials) {
  # Set filename with extension and initials to make filename with date integrated.
  filename <- substitute(filename)
  initials <- substitute(initials)
  filename <- paste0(initials, Date, filename)
  filename
}


# Set custom ggplot2 theme and custom colors
theme_classic_lines <- function() {
  theme_pubr(border = F, legend = "top") +
            theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_classic_lines_45 <- function() {
  theme_pubr(border = T, legend = "top", x.text.angle = 45) +
            theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_classic_lines_90 <- function() {
  theme_pubr(border = T, legend = "top", x.text.angle = 90) +
            theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_set(theme_classic_lines())

colors_diverse <- c("#264653", "#9AC1AE", "#5D987B", "#f2cc8f", "#e76f51")

ggplot_custom <- function(...) ggplot2::ggplot(...) + 
  scale_color_manual(values = colors_diverse) + 
  scale_fill_manual(values = colors_diverse)

Loading data

# Import processed bc counts from the preprocessing step
cDNA_df <- read.csv("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/gcf6578_gen2_stimulation-2/results/mt20211118_reporter_activity_filt_combined.csv", header = T) %>%
  dplyr::select(-tf_activity, -reporter_id2, 'pval_minP' = pval, -pval_adj, -mutated_activity)

# Filter TP53 from library 1+2 K562 data (we previously determined that we should filter these data)
cDNA_df2 <- cDNA_df %>%
  filter(library == "1+2" & condition == "K562_DMSO" & tf == "TP53")
cDNA_df <- anti_join(cDNA_df, cDNA_df2, by = c("reporter_id", "sample_id"))

Plotting reporter activities

Reporter activity compared to background

Display reporter activities relative to background activity per tf and condition

# Calculate fdr from the previously calculated p-values to correct for multiple hypotheses
cDNA_df$pval_adj_minP <- p.adjust(cDNA_df$pval_minP, method = 'fdr')

# Plot the reporter activities over the minimal promoter only and the adjusted p-value for each reporter
for (i in unique(cDNA_df$condition)) {
  
  p <- ggplot_custom(cDNA_df %>%
                filter(condition == i, neg_ctrls == "No") %>%
                dplyr::select(reporter_id, pval_adj_minP, reporter_activity_minP, promoter) %>%
                unique(),
            aes(x = log2(reporter_activity_minP), y = -log10(pval_adj_minP),
                color = ifelse(log2(reporter_activity_minP) > 1 & pval_adj_minP < 0.05, "green", 
                               ifelse(log2(reporter_activity_minP) < -1 & pval_adj_minP < 0.05, "yellow","black")))) +
    geom_point() +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
    geom_vline(xintercept = log2(2), linetype = "dashed") +
    geom_vline(xintercept = -log2(2), linetype = "dashed") +
    facet_wrap(~promoter) +
    ggtitle(paste('reporter activity over minimal promoter only, condition:', i))
  
  print(p)
}

# Same plot for one condition only
ggplot_custom(cDNA_df %>%
                filter(condition == "HepG2_CDCA", neg_ctrls == "No") %>%
                dplyr::select(reporter_id, pval_adj_minP, reporter_activity_minP, promoter) %>%
                unique() %>%
                mutate(reporter_activity_minP = log2(reporter_activity_minP),
                       pval_adj_minP = -log10(pval_adj_minP)),
            aes(x = reporter_activity_minP, y = pval_adj_minP, 
                color = ifelse(reporter_activity_minP > 1 & pval_adj_minP > -log10(0.05), ifelse(reporter_activity_minP > 3 & pval_adj_minP > -log10(0.05), "green", "blue"), 
                               ifelse(reporter_activity_minP < -1 & pval_adj_minP > -log10(0.05), "yellow","black")))) +
  geom_point() +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  facet_wrap(~promoter) + 
  ggtitle('reporter activity over minimal promoter only, HepG2_CDCA')

# Same plot for one condition, but now plotting reporter_activity_neg and its significance
ggplot_custom(cDNA_df %>%
                filter(condition == "HepG2_CDCA", neg_ctrls == "No", library == "1+2") %>%
                dplyr::select(reporter_id, pval_neg, reporter_activity_neg, promoter) %>%
                mutate(pval_neg = p.adjust(pval_neg, method = 'fdr')) %>%
                unique(),
            aes(x = log2(reporter_activity_neg), y = -log10(pval_neg), 
                color = ifelse(log2(reporter_activity_neg) > 1 & pval_neg < 0.05, "green", 
                               ifelse(log2(reporter_activity_neg) < -1 & pval_neg < 0.05, "yellow","black")))) +
  geom_point() +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = log2(2), linetype = "dashed") +
  geom_vline(xintercept = -log2(2), linetype = "dashed") +
  facet_wrap(~promoter) + 
  ggtitle('reporter activity over paired negative control, HepG2_CDCA')
## Warning: Removed 884 rows containing missing values (geom_point).


Calculate fraction of active/repressive reporters

# Stratify reporters by their activity
cDNA_df$reporter_activity_class <- 0
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP > 2] <- 1 # significantly active = significant 2-fold increase
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP > 8] <- 2 # significantly highly active = significant 8-fold increase
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP < 0.5] <- -1 # significantly repressive = significant 2-fold decrease

## Sub-select relevant data
cDNA_df2 <- cDNA_df %>%
                filter(neg_ctrls == "No", commercial_reporter == "No") %>%
                dplyr::select(reporter_id, tf, reporter_activity_class, condition, promoter) %>%
                unique()

# Calculate per condition how many reporters per TF are upregulated, downregulated, or unchanged
cDNA_df2 <- cDNA_df2 %>%
  mutate(class_count = ave(reporter_activity_class, tf, promoter, condition, FUN = function(x) sum(x)))


for (i in unique(cDNA_df2$condition)) {
  p <- ggplot_custom(cDNA_df2 %>%
                       filter(condition == i),
                     aes(x = reorder(tf, -class_count), fill = factor(reporter_activity_class, levels = c("0", "1", "2", "-1")))) +
    geom_bar(position = "fill") +
    geom_hline(yintercept = 0.25, linetype = "dashed") +
    geom_hline(yintercept = 0.75, linetype = "dashed") +
    theme_classic_lines_90() +
    facet_wrap(~promoter) +
    xlab("TF") +
    ggtitle(paste('fraction of active/repressive reporters, condition:',i))
  
  print(p)
}

## Same plot but for one condition
ggplot_custom(cDNA_df2 %>%
                       filter(condition == "mES_2i_LIF", promoter == "mCMV"),
                     aes(x = reorder(tf, -class_count), fill = factor(reporter_activity_class, levels = c("0", "1", "2", "-1")))) +
    geom_bar(position = "fill") +
    geom_hline(yintercept = 0.25, linetype = "dashed") +
    geom_hline(yintercept = 0.75, linetype = "dashed") +
    theme_classic_lines_90()+
    xlab("TF") +
    ggtitle('fraction of active/repressive reporters, mES_2i_LIF - mCMV')


Fraction of active reporters per TF

How many reporters are classified as active/repressive per tf and condition?

# Divide TFs per condition into different classes: activators, selective activator, inactive, selective repressor, repressor

## Determine amount of active/repressive reporters per tf
cDNA_df3 <- cDNA_df2 %>%
  mutate(class_count2 = ave(reporter_activity_class, tf, condition, FUN = function(x) length(x)),
         class_count_rel = ave(reporter_activity_class, reporter_activity_class, tf, condition, FUN = function(x) length(x))) %>%
  mutate(class_count_rel = class_count_rel / class_count2) %>%
  dplyr::select(tf, condition, class_count_rel, reporter_activity_class) %>%
  unique() %>%
  spread(reporter_activity_class, class_count_rel)

cDNA_df3$class[cDNA_df3$`0` >= 0.75] <- 0 # inactive = >75% of the reporters are inactive
cDNA_df3$class[cDNA_df3$`1` >= 0.75] <- 2 # active = >75% of the reporters are active
cDNA_df3$class[cDNA_df3$`2` >= 0.75] <- 4 # highly active = >75% of the reporters are highly active
cDNA_df3$class[cDNA_df3$`-1` >= 0.75] <- -4 # repressive = >75% of the reporters are repressive
cDNA_df3$class[cDNA_df3$`1` >= 0.25 & cDNA_df3$`1` < 0.75] <- 1 # partly active = >25% & <75% of the reporters are active
cDNA_df3$class[cDNA_df3$`1` < 0.25 & cDNA_df3$`2` < 0.25 & cDNA_df3$`1` + cDNA_df3$`2` >= 0.25] <- 1 # partly active = >25% of the reporters are active or highly active
cDNA_df3$class[cDNA_df3$`2` >= 0.25 & cDNA_df3$`2` < 0.75] <- 3 # partly highly active = >25% of the reporters are highly active
cDNA_df3$class[cDNA_df3$`-1` >= 0.25 & cDNA_df3$`-1` < 0.75] <- -2 # partly repressive = >25% & <75% of the reporters are repressive
cDNA_df3$class[cDNA_df3$`1` >= 0.085 & cDNA_df3$`-1` >= 0.085] <- -1 # mixed active/repressive = > 8.5% are active and >8.5% are repressive


## Sub-select relevant columns, for now only focus on the conditions where I have robust data from both libraries
cDNA_df4 <- cDNA_df3 %>%
  dplyr::select(tf, condition, class) %>%
  unique() %>%
  na.omit() %>%
  filter(condition %in% c("HepG2_CDCA", "HepG2_DMSO", "K562_DMSO", "mES_2i", 
                          "mES_2i_LIF", "mES_HQ", "mES_LIF_CH", "mES_LIF_PD", 
                          "mES_N2B27")) %>%
  mutate(sum = ave(class, tf, FUN = function(x) sum(x))) %>%
  filter(sum != 0) %>%
  dplyr::select(-sum)

cDNA_df4 <- cDNA_df4 %>%
  spread(tf, class) %>%
  column_to_rownames('condition')

## Make custom color palette
makeColorRampPalette <- function(colors, cutoff.fraction, num.colors.in.palette)
{
  stopifnot(length(colors) == 4)
  ramp1 <- colorRampPalette(colors[1:2])(num.colors.in.palette * cutoff.fraction)
  ramp2 <- colorRampPalette(colors[2:4])(num.colors.in.palette * (1 - cutoff.fraction))
  return(c(ramp1, ramp2))
}

cols <- makeColorRampPalette(c(colors_diverse[4], colors_diverse[1], colors_diverse[2], colors_diverse[3]), 
                             1-(4/6),
                             100)
## Plot heatmap showing fraction of active reporters
pheatmap(as.matrix(cDNA_df4),
         color = cols,
         border_color = "white", 
         cellwidth = 8, cellheight = 8,
         angle_col = 90,
         main = "fraction of active/repressive reporters per tf")


Reporter feature modeling

Linear model to fit classifications

Why are the reporters classified as repressive/activating? Fit linear model to reporter classification

# Sub-select relevant data
cDNA_df_lm <- cDNA_df %>%
  filter(neg_ctrls == "No", commercial_reporter == "No", promoter != "Random",
         condition %in% c("HepG2_CDCA", "HepG2_DMSO", "K562_DMSO", "mES_2i", 
                          "mES_2i_LIF", "mES_HQ", "mES_LIF_CH", "mES_LIF_PD", 
                          "mES_N2B27"
                          ),
         tf %in% names(cDNA_df4)) %>%
  mutate(background = as.factor(background)) %>%
  dplyr::select(reporter_id, tf, reporter_activity_class, condition, promoter, spacing, background, distance) %>%
  unique()

cDNA_df3[is.na(cDNA_df3)] <- 0

# Only select tfs for which differences in reporter activity were observed (for the other ones it doesn't make sense to fit models)
mixed_classes <- cDNA_df3 %>%
  mutate(id = paste(condition, tf, sep = "_")) %>%
  dplyr::select(-class) %>%
  #filter(class != 0) %>%
  unique() %>%
  filter(`1` < 0.95, `0` < 0.95, `2` < 0.95)

# Select only relevant tfs from source data frame and only include ids with more than two entries
cDNA_df_lm$id <- paste(cDNA_df_lm$condition, cDNA_df_lm$tf, sep = "_")
cDNA_df_lm <- cDNA_df_lm %>%
  mutate(sum = ave(id, id, FUN = function(x) length(x))) %>%
  filter(sum >= 2, id %in% mixed_classes$id) %>%
  dplyr::select(-sum)


# Fit linear model for each id (combination of condition and tf)
lm_features <- lapply(unique(cDNA_df_lm$id), function(x) lm(reporter_activity_class ~ promoter + spacing + background + distance, 
                                                                     cDNA_df_lm[cDNA_df_lm$id == x,]))

names(lm_features) <- unique(cDNA_df_lm$id)


## Extract and plot weights from linear model
lm_weights <- list()
for (i in names(lm_features)) {
  lm_weights[[i]] <- lm_features[[i]]$coefficients
}

lm_weights <- bind_rows(lm_weights, .id = "id")

lm_weights <- lm_weights %>%
  dplyr::select(-`(Intercept)`) %>%
  pivot_longer(-id, names_to = "feature", values_to = "weight")

ggplot_custom(lm_weights, aes(x = feature, y = weight)) +
  geom_quasirandom() +
  theme_classic_lines_90() +
  ggtitle('weights of all models')
## Warning: Removed 279 rows containing missing values (position_quasirandom).

## Extract and plot features from linear model
exp_features <- list()
for (i in names(lm_features)) {
  exp_features[[i]] <- anova(lm_features[[i]]) %>%
  dplyr::select('sum_sq' = `Sum Sq`) %>%
    na.omit() %>%
  mutate(sum = sum(sum_sq)) %>%
  mutate(rel_sum_sq = (sum_sq/sum)*100) %>%
  setnames("rel_sum_sq", "percent") %>%
  dplyr::select("percent")
}
exp_features <- bind_rows(exp_features, .id = "id") %>%
  rownames_to_column("feature") %>%
  mutate(feature = gsub("...[0-9]{1-3}", "", feature)) %>%
  mutate(feature = gsub("[0-9]", "", feature)) %>%
  mutate(percent = format(percent, scientific = F, digits = 0)) %>%
  mutate(percent = as.numeric(percent))

exp_features$group <- "no residual"
exp_features$group[grep("Residual", exp_features$feature)] <- "residual"
exp_features$percent[exp_features$feature == "Residuals"] <- 100 - exp_features$percent[exp_features$feature == "Residuals"]
exp_features$feature <- revalue(exp_features$feature, c("Residuals"="total"))

ggplot_custom(exp_features, 
       aes(x = percent, y = reorder(feature, -percent), fill = group)) +
  geom_bar(stat = "identity") + 
  xlab("Variance explained (%)") + 
  ylab("") +
  ggtitle('sum of features of all models')

## Plot features separately per tf activity class
classes <- cDNA_df3 %>%
  mutate(id = paste(condition, tf, sep = "_")) %>%
  dplyr::select(id, class) %>%
  unique()

exp_features <- merge(exp_features, classes, by = "id")

ggplot_custom(exp_features, 
       aes(y = percent, x = feature, color = group)) +
  geom_quasirandom() + 
  xlab("Variance explained (%)") + 
  ylab("") +
  facet_wrap(~class) +
  theme_classic_lines_90() +
  ggtitle('features of all models per tf activity class')

## Discard poor performing linear models (<50% of variance explained)
exp_features_good <- exp_features %>%
  filter(group == "residual", percent >= 50)
exp_features_sel <- exp_features[exp_features$id %in% exp_features_good$id,]

exp_features_sel <- exp_features_sel %>%
  mutate(tf = gsub(".*_(.*)", "\\1", id),
         condition = gsub("(.*)_.*", "\\1", id))

## Plot again the features
ggplot_custom(exp_features_sel, 
       aes(y = percent, x = feature, color = group)) +
  geom_quasirandom() + 
  xlab("Variance explained (%)") + 
  ylab("") +
  ggtitle('features of all models - only good models')

## Plot features individually per id
### Order the ids
order <- exp_features_sel %>%
                mutate(sum = ave(percent, id, FUN = function(x) sum(x))) %>%
  dplyr::select(sum, tf) %>%
  unique() %>%
  group_by(tf) %>%
  top_n(1, sum)

exp_features_sel_ord <- merge(order, exp_features_sel, by = "tf") %>%
  filter(feature != "total")
  

### As bar plot
ggplot(exp_features_sel_ord %>%
         filter(condition %in% c("K562_DMSO", "HepG2_DMSO", "mES_2i_LIF")), 
       aes(y = percent, x = reorder(id, -sum), fill = feature, group = feature)) +
  geom_bar(stat = "identity", position = "stack") + 
  xlab("Variance explained (%)") + 
  ylab("") +
  scale_fill_manual(values = c("#e07a5f", "#3d405b", "#81b29a", "#f2cc8f"))+
  theme_classic_lines_90() +
  ggtitle('feature importance per tf & condition')

### As connected line-plot
ggplot_custom(exp_features_sel, aes(x=feature, y=percent, color=group, group = id)) + 
  geom_line() +
  geom_point()+
  ggtitle('feature importance per tf & condition')

### Only select the three main cell lines to simplify
exp_features_sel_heatmap <- exp_features_sel %>%
  filter(condition %in% c("mES_2i_LIF", "K562_DMSO", "HepG2_DMSO")) %>%
  mutate(cell_type = gsub("(.*)_.*", "\\1", condition)) %>%
  mutate(cell_type = gsub("(.*)_.*", "\\1", cell_type)) %>%
  dplyr::select(-group, -condition) %>%
  filter(feature != "total") %>%
  spread(feature, percent) %>% 
  column_to_rownames("id") #%>%
  #arrange(tf)

cell_type <- exp_features_sel_heatmap %>%
  dplyr::select(cell_type)

colors_pheatmap <- list('cell_type' = c("K562" = colors_diverse[1], "HepG2" = colors_diverse[5], "mES" = colors_diverse[4]))

pheatmap(as.matrix(t(exp_features_sel_heatmap %>% dplyr::select(-cell_type, -tf, -class))),
         color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
         border_color = "white", 
         cellwidth = 8, cellheight = 8,
         angle_col = 90, annotation_col = cell_type, cluster_cols = T,
         annotation_colors = colors_pheatmap,
         main = 'feature importance per tf & condition')


Comparison of TF reporters between conditions

Can we select the best reporter for each TF?

# Select conditions for which I have robust data, remove highly active reporters that I took over from library 1 and included in library 2 (because those distort the activity distributions)
cDNA_df2 <- cDNA_df %>%
  filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH"), library != "1+2")
tfs_lib1 <- cDNA_df %>%
  filter(library == "1") %>%
  dplyr::select(tf) %>% 
  unique()
  
cDNA_df3 <- cDNA_df %>%
  filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH"), library == "1+2")

cDNA_df3 <- cDNA_df3[!cDNA_df3$tf %in% tfs_lib1$tf,]

cDNA_df<- rbind(cDNA_df2, cDNA_df3)



# Explore the ids for which I was able to fit a good linear model
ids_selected <- cDNA_df_lm %>%
  filter(id %in% rownames(exp_features_sel_heatmap)) %>%
  mutate(reporter_activity_class = as.character(reporter_activity_class),
         sum_all = ave(id, id, FUN = function(x) length(x)),
         sum_class = ave(id, id, reporter_activity_class, FUN = function(x) length(x))) %>%
  mutate(sum_all = as.numeric(sum_all),
         sum_class = as.numeric(sum_class),
         frac = sum_class / sum_all)

## Compare for some example tfs the activity fraction in different conditions
reporter_correlations <- cDNA_df %>%
  filter(tf %in% c("GRHL1", "POU5F1::SOX2", "MAF::NFE2"),
         condition %in% c("mES_2i_LIF", "K562_DMSO", "HepG2_DMSO")) %>%
  dplyr::select(reporter_id, condition, tf, pval_adj_minP) %>%
  mutate(pval_adj_minP = -log10(pval_adj_minP)) %>%
  unique() %>%
  spread(condition, pval_adj_minP)

ggplot_custom(reporter_correlations, aes(x = K562_DMSO, y = mES_2i_LIF)) +
  geom_point() +
  facet_wrap(~tf, scales = "free")


Compare reporter activities to other measures

Comparison with TF expression and MPRA-inferred TF activities (Ernst et al.)

# Extract TF activities for K562 & HepG2
tf_activities <- cDNA_df %>%
  filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
  dplyr::select(reporter_id, reporter_activity_minP, condition, tf) %>%
  unique() %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  spread(condition, reporter_activity_minP) %>%
  mutate(dif = log2(K562_DMSO / HepG2_DMSO)) %>%
  dplyr::select(tf, dif, reporter_id)

#write_csv2(tf_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_all.csv")

# Correlate with RNA-seq
load('/DATA/usr/m.martinez.ara/GitLab/gurten/gurten/data/fc181121_epsure_tadec_rnaseq_expr_joshi.RData')

# remove Serum data and calculate mean expression
tib_fpkm_joshi$twoi <- (tib_fpkm_joshi$twoi_1 + tib_fpkm_joshi$twoi_2)/2
tib_fpkm_joshi <- tib_fpkm_joshi %>%
  dplyr::select(gene_id, 'tf' = symbol, twoi)

tf_activities2 <- cDNA_df %>%
  filter(condition == "mES_2i_LIF", neg_ctrls == "No") %>%
  dplyr::select(reporter_id, reporter_activity_minP, tf) %>%
  unique() %>%
  mutate(tf = gsub("_.*", "", tf),
         tf_activity = ave(reporter_activity_minP, tf, FUN = function(x) mean(x))) %>%
  dplyr::select(tf_activity, tf) %>%
  unique()

tf_activities2 <- merge(tf_activities2, tib_fpkm_joshi, by = "tf") %>%
  mutate(tf_activity = (tf_activity-mean(tf_activity))/sd(tf_activity),
         twoi = (twoi-mean(twoi))/sd(twoi))

ggplot_custom(tf_activities2, aes(x = twoi, y = tf_activity)) +
  geom_point() +
  ggtitle('TF reporter activities vs. TF expression in mESCs')

# Correlate with regulon (VIPER, Aerts)


# Correlate with TF activities from Ernst et al. (https://doi.org/10.1038/nbt.3678)
ernst_activities <- read.table("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/ernst_tf_activities_hepg2_k562.csv", sep = ";", dec = ",", header = T, fill = T)

ernst_activities <- ernst_activities %>%
  dplyr::select('tf' = TF, activity_enrichment_HepG2, activity_enrichment_K562) %>%
  mutate(avg_score_HepG2 = ave(activity_enrichment_HepG2, tf, FUN = function(x) mean(x)),
         avg_score_K562 = ave(activity_enrichment_K562, tf, FUN = function(x) mean(x))) %>%
  unique() %>%
  pivot_longer(-tf, names_to = "condition", values_to = "ernst_activity") %>%
  mutate(condition = gsub("avg_score_", "", condition))

tf_activities <- cDNA_df %>%
  filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
  dplyr::select(reporter_activity_minP, condition, tf) %>%
  mutate(tf = gsub("_.*", "", tf),
         condition = gsub("_DMSO", "", condition),
         reporter_activity_minP = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
  unique()

ernst_activities <- merge(ernst_activities, tf_activities, by = c('tf', 'condition')) %>%
  mutate(ernst_activity = (ernst_activity-mean(ernst_activity))/sd(ernst_activity),
         reporter_activity_minP = (reporter_activity_minP-mean(reporter_activity_minP))/sd(reporter_activity_minP))

ggplot_custom(ernst_activities, 
              aes(x = ernst_activity, y = reporter_activity_minP)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
  facet_wrap(~condition) +
  ggtitle('TF reporter activities vs. MPRA-chunk-inferred TF activities')
## `geom_smooth()` using formula 'y ~ x'

#write_csv2(ernst_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_ernst.csv")

Plotting mean TF activities per condition

display actual activity values per condition and TF instead of fraction of active reporters per TF

## Prepare dataframe
# Caculate mean TF activity per condition
tf_activity_heatmap <- cDNA_df %>%
  filter(neg_ctrls == "No", commercial_reporter == "No") %>%
  mutate(tf_activity = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
  #mutate(tf_activity = log2(tf_activity)) %>%
        #mutate(tf_activity = ave(tf_activity, condition, FUN = function(x) (x-mean(x))/sd(x))) %>%
  dplyr::select(tf_activity, condition, tf) %>%
  unique() %>%
  spread(condition, tf_activity) %>% 
  remove_rownames %>% 
  column_to_rownames(var="tf")

# Keeping the scale in the pheatmap function
myBreaks1 <- seq(0,6,0.06)

# Keep only TFs that are in at least one condition above a certain threshold
tf_activity_heatmap <- tf_activity_heatmap[rowSums(tf_activity_heatmap > 4) >= 1,] %>%
  na.omit()

#tf_activity_heatmap <- tf_activity_heatmap[,c(2,1,6,7,3,8,4,5)]

# pheatmap function
pheatmap(as.matrix(t(tf_activity_heatmap)),
         color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
         breaks = myBreaks1, border_color = "black", 
         cellwidth = 8, cellheight = 8,
         angle_col = 90,
         main = 'reporter activities per tf and condition')

This heatmap doesn’t take significance into consideration and to me is less representative of tf activity than the fraction of reporter activity classes heatmap



Session Info

paste("Run time: ",format(Sys.time()-StartTime))
## [1] "Run time:  2.488774 mins"
getwd()
## [1] "/DATA/usr/m.trauernicht/projects/SuRE-TF/analyses"
date()
## [1] "Tue Dec 21 12:37:51 2021"
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] randomForest_4.6-14 stringr_1.4.0       tidyr_1.1.3        
##  [4] pROC_1.17.0.1       gridExtra_2.3       cowplot_1.1.1      
##  [7] plyr_1.8.6          viridis_0.6.1       viridisLite_0.4.0  
## [10] ggforce_0.3.3       ggbeeswarm_0.6.0    ggpubr_0.4.0       
## [13] pheatmap_1.0.12     tibble_3.1.2        maditr_0.8.2       
## [16] dplyr_1.0.7         ggplot2_3.3.5       RColorBrewer_1.1-2 
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.0        jsonlite_1.7.2    splines_4.0.5     carData_3.0-4    
##  [5] bslib_0.2.5.1     assertthat_0.2.1  highr_0.9         vipor_0.4.5      
##  [9] cellranger_1.1.0  yaml_2.2.1        lattice_0.20-41   pillar_1.6.1     
## [13] backports_1.2.1   glue_1.4.2        digest_0.6.27     ggsignif_0.6.2   
## [17] polyclip_1.10-0   colorspace_2.0-2  Matrix_1.3-2      htmltools_0.5.1.1
## [21] pkgconfig_2.0.3   broom_0.7.8       haven_2.4.1       purrr_0.3.4      
## [25] scales_1.1.1      tweenr_1.0.2      openxlsx_4.2.4    rio_0.5.27       
## [29] mgcv_1.8-34       generics_0.1.0    farver_2.1.0      car_3.0-11       
## [33] ellipsis_0.3.2    withr_2.4.2       magrittr_2.0.1    crayon_1.4.1     
## [37] readxl_1.3.1      evaluate_0.14     fansi_0.5.0       nlme_3.1-152     
## [41] MASS_7.3-53.1     rstatix_0.7.0     forcats_0.5.1     foreign_0.8-81   
## [45] beeswarm_0.4.0    tools_4.0.5       data.table_1.14.0 hms_1.1.0        
## [49] lifecycle_1.0.0   munsell_0.5.0     zip_2.2.0         compiler_4.0.5   
## [53] jquerylib_0.1.4   rlang_0.4.11      grid_4.0.5        labeling_0.4.2   
## [57] rmarkdown_2.11    gtable_0.3.0      abind_1.4-5       DBI_1.1.1        
## [61] curl_4.3.2        R6_2.5.0          knitr_1.33        utf8_1.2.1       
## [65] stringi_1.7.2     Rcpp_1.0.7        vctrs_0.3.8       tidyselect_1.1.1 
## [69] xfun_0.24